draw_pie_chart <- function(data.df, column) {
data_counts.df <- data.df %>%
dplyr::select(id, {{column}}) %>%
distinct() %>%
group_by({{column}}) %>%
summarise(counts = n()) %>%
arrange(desc(counts)) %>%
mutate(percentage = percent(counts / sum(counts)))
pie <- ggplot(data_counts.df, aes(x="", y=counts, fill={{column}})) +
geom_bar(stat = "identity", width = 0.05, color="black") +
coord_polar("y", start=0) +
scale_fill_brewer(palette = "Set3", direction = -4) +
geom_label_repel(aes(label = percentage), size=3.5, show.legend = F, position = position_stack(vjust = .5)) +
guides(fill = guide_legend(title = "Stem type")) +
ggtitle(paste0("Stem types - ", sum(data_counts.df$counts)," duplexes")) +
theme_void() # remove background, grid, numeric labels
return(pie)
}
# get_stem_length <- function(data.df) {
# data_sh.df <- data.df %>%
# group_by(id) %>%
# dplyr::filter(element == "s0" | element == "h0") %>% #use these positions to estimate stem length
# mutate(stem_length_L = L_start - lag(L_start),
# stem_length_R = lag(R_end) - L_end) %>%
# dplyr::filter(element == "h0") %>%
# dplyr::select(id, stem_length_L, stem_length_R)
# data.df <- left_join(data.df, data_sh.df, by = "id")
# return(data.df)
# }
# get_hairpin_length <- function(data.df) {
# data_h.df <- data.df %>%
# group_by(id) %>%
# dplyr::filter(element == "h0") %>%
# mutate(hairpin_length = sum(L_width)) %>%
# dplyr::select(id, hairpin_length)
# data.df <- left_join(data.df, data_h.df, by = "id")
# return(data.df)
# }
get_paired_regions <- function(data.df) {
duplex_max.df <- data.df %>%
group_by(id) %>%
dplyr::filter(element_type == "s") %>%
summarise(max_duplex = max(L_width),
sum_paired = sum(L_width)) %>%
dplyr::select(id, max_duplex, sum_paired)
data.df <- left_join(data.df, duplex_max.df)
}
count_element <- function(data.df, structure_type) {
counter <- data.df %>%
group_by(id) %>%
dplyr::filter(element_type == structure_type) %>%
summarise(count=n())
return(counter)
}
analyse_duplexes <- function(data.df) {
data.df <- get_paired_regions(data.df)
# data.df <- get_stem_length(data.df)
# data.df <- get_hairpin_length(data.df)
#data.df <- get_iloop_length_sum(data.df)
iloop_counts.df <- count_element(data.df, "i") %>%
dplyr::select(id, count) %>%
dplyr::rename(iloop_counts = count)
data.df <- left_join(data.df, iloop_counts.df, by = "id")
return(data.df)
}
get_id_features <- function(data.df) {
id_features.df <- data.df %>%
dplyr::select(id, max_duplex, sum_paired, iloop_counts) %>%
distinct() %>%
replace(is.na(.), 0)
return(id_features.df)
}plot_stacked_barchart <- function(data.df, column, group) {
data_counts.df <- data.df %>%
dplyr::select(name, {{group}}, {{column}}) %>%
group_by({{group}}, {{column}}) %>%
summarise(counts = n()) %>%
arrange(desc({{column}})) %>%
mutate(percentage = counts*100 / sum(counts)) %>%
mutate(pos = cumsum(percentage) - percentage/2)
bar = ggplot() + geom_bar(aes(y = percentage, x = "", fill = {{column}}), alpha = 0.8, data = data_counts.df,
stat="identity", width = 0.5) +
geom_text(data = data_counts.df, aes(x = "", y = pos, label = paste0(round(percentage,1),"%")), size=3) +
facet_grid(cols = vars({{group}})) +
#facet_wrap(. ~ Experiment, scales = "free") +
scale_fill_paletteer_d("rcartocolor::Earth", direction = -1) +
theme(legend.position="right", legend.direction="vertical",
legend.title = element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
ylab("Percentage") +
xlab("Experiment")
return(bar)
}get_nucleotide_frequency <- function(duplexes.df) {
duplexes.df <- duplexes.df %>%
dplyr::filter(!is.na(l_paired_residues)) %>%
dplyr::filter(!is.na(r_paired_residues))
L_seq <- DNAStringSet(duplexes.df$l_paired_residues)
R_seq <- DNAStringSet(duplexes.df$r_paired_residues)
L_freq <- as.data.frame(oligonucleotideFrequency(L_seq, width = 1, as.prob=TRUE))
R_freq <- as.data.frame(oligonucleotideFrequency(R_seq, width = 1, as.prob=TRUE))
duplexes.df <- dplyr::bind_cols(duplexes.df, L_freq, R_freq)
colnames(duplexes.df) = gsub("...41|...42|...43|...44", "_L", colnames(duplexes.df)) # rename columns
colnames(duplexes.df) = gsub("...45|...46|...47|...48", "_R", colnames(duplexes.df))
# Pur content - calculate max pur L vs. R arm
duplexes.df <- duplexes.df %>%
rowwise() %>%
mutate(L_pur = sum(c(A_L, G_L)), # purine total per hybrid
R_pur = sum(c(A_R, G_R))) %>%
mutate(max_pur = max(L_pur, R_pur)) %>%
mutate(arm = case_when((max_pur == L_pur) ~ "L", # assign maximal purine arm
TRUE ~ "R")) %>%
ungroup()
# reorient duplex frequencies based on maximal purine content (column "arm")
nuc_freq_max_pur_arm.df <- duplexes.df %>%
dplyr::filter(arm == "L") %>%
mutate(A_freq_max_pur_arm = A_L, A_freq_min_pur_arm = A_R,
G_freq_max_pur_arm = G_L, G_freq_min_pur_arm = G_R,
C_freq_max_pur_arm = C_L, C_freq_min_pur_arm = C_R,
U_freq_max_pur_arm = T_L, U_freq_min_pur_arm = T_R)
nuc_freq_min_pur_arm.df <- duplexes.df %>%
dplyr::filter(arm == "R") %>%
mutate(A_freq_max_pur_arm = A_R, A_freq_min_pur_arm = A_L,
G_freq_max_pur_arm = G_R, G_freq_min_pur_arm = G_L,
C_freq_max_pur_arm = C_R, C_freq_min_pur_arm = C_L,
U_freq_max_pur_arm = T_R, U_freq_min_pur_arm = T_L)
nuc_freq_reordered.df <- rbind(nuc_freq_max_pur_arm.df, nuc_freq_min_pur_arm.df)
nuc_freq_reordered.df <- nuc_freq_reordered.df %>% arrange(desc(max_pur))
return(nuc_freq_reordered.df)
}plot_nucleotide_frequency <- function(data) {
data <- data %>%
dplyr::select(id, A_freq_max_pur_arm, G_freq_max_pur_arm, C_freq_max_pur_arm, U_freq_max_pur_arm,
A_freq_min_pur_arm, G_freq_min_pur_arm, C_freq_min_pur_arm, U_freq_min_pur_arm) %>%
dplyr::distinct() %>%
drop_na() %>%
dplyr::arrange(desc(G_freq_max_pur_arm))
# Prepare for plotting
high_pur.df <- data %>%
dplyr::select(id, A_freq_max_pur_arm, G_freq_max_pur_arm, C_freq_max_pur_arm, U_freq_max_pur_arm)
high_pur.df <- rowid_to_column(high_pur.df)
long_high_pur.df <- high_pur.df %>%
gather(residue, frequency, A_freq_max_pur_arm:U_freq_max_pur_arm)
long_high_pur.df$arm = "High purine arm"
low_pur.df <- data %>%
dplyr::select(id, A_freq_min_pur_arm, G_freq_min_pur_arm, C_freq_min_pur_arm, U_freq_min_pur_arm)
low_pur.df <- rowid_to_column(low_pur.df)
long_low_pur.df <- low_pur.df %>%
gather(residue, frequency, A_freq_min_pur_arm:U_freq_min_pur_arm)
long_low_pur.df$arm = "Low purine arm"
nuc_freq.df <- rbind(long_high_pur.df, long_low_pur.df)
nuc_freq.df$residue <- str_sub(nuc_freq.df$residue, 1,1) #trim string to get nucleotide (first) letter
# Plot individual nucleotide frequencies
nuc_freq.gg <- ggplot(nuc_freq.df, aes(x=rowid, y=frequency, color = residue)) +
geom_smooth(se=F) + coord_flip() +
facet_grid(cols = vars(arm)) +
ylab("Frequency") +
xlab("ID") +
ylim(0, 1) +
scale_color_manual(values = c("darkgreen","dodgerblue4","goldenrod3","firebrick"))
return(nuc_freq.gg)
}run_rnaeval <- function(fasta.file, args = c("-i", paste0(fasta.file,">temp.fa"))) {
rnaeval.out <- system2(command = "RNAeval", args = args, stdout = TRUE)
rnaeval.df <- as.data.frame(readBStringSet("temp.fa"))
rnaeval.df <- rnaeval.df %>% rownames_to_column(var = "id")
rnaeval.df <- rnaeval.df %>%
rowwise() %>%
mutate(eval_mfe = as.numeric(str_sub(x, -7,-2))) %>%
ungroup() %>%
dplyr::select(id, eval_mfe)
return(rnaeval.df)
}
get_mfe <- function(forgi, rnafold, prefix) {
data.df <- forgi %>%
dplyr::filter(element_type == "s") %>%
group_by(id) %>%
mutate(L_limit = min(L_start),
R_limit = max(R_end)) %>%
dplyr::select(id, L_limit, R_limit) %>%
distinct() %>%
ungroup()
data.df <- left_join(data.df , rnafold, by = "id") # add the sequence and structure to the forgi
data.df <- data.df %>%
mutate(stem_loop_seq = substr(sequence, L_limit, R_limit),
stem_loop_db = substr(mea_structure, L_limit, R_limit)) %>%
dplyr::select(-sequence, -mea_structure)
# write multi-fasta with seq and db
fasta <- character(nrow(data.df) * 3) # empty file
fasta[c(TRUE, FALSE, FALSE)] <- paste0(">", data.df$id)
fasta[c(FALSE, TRUE, FALSE)] <- data.df$stem_loop_seq
fasta[c(FALSE, FALSE, TRUE)] <- data.df$stem_loop_db
writeLines(fasta, paste0(prefix,"_rnaeval.fasta"))
mfe.df <- run_rnaeval(paste0(prefix,"_rnaeval.fasta"))
data.df <- left_join(data.df, mfe.df, by = "id")
forgi <- left_join(forgi, data.df, by = "id")
invisible(file.remove("temp.fa"))
return(forgi)
}# Outputs from mfe and forgi analyses
setwd("~/Documents/projects/computational_hiCLIP/notebooks")The working directory was changed to /Users/iosubi/Documents/projects/computational_hiCLIP/notebooks inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
nolinker.dt <- fread("stau1_3utr_minmfe_cluster_hybrids.mfe_analyses.tsv.gz") # analyse_nolinker_duplexes.R output
nolinker_forgi.dt <- fread("stau1_3utr_minmfe_cluster_hybrids.forgi.tsv.gz") # similar but for PARIS
paris.dt <- fread("paris_3utr_minmfe_cluster_hybrids.mfe_analyses.tsv")
paris.forgi.dt <- fread("paris_3utr_minmfe_cluster_hybrids.forgi.tsv.gz")
nornase.forgi.dt <- fread("stau1.peaks.10nt_10nt_forgi_analyses.df.txt")
rnafold.df <- read.csv("/Users/iosubi/Documents/projects/computational_hiCLIP/nonhybrids_10nt_10nt/stau1.rnafold.tsv.gz", sep = "\t")
rnafold.df <- rowid_to_column(rnafold.df, "id")
rnafold.df$id <- paste0("ID", rnafold.df$id, sep="")
rnafold.df <- dplyr::select(rnafold.df, c(id, sequence, mea_structure))# Annotations
genes.gr <- rtracklayer::import.gff2("/Users/iosubi/Documents/projects/computational_hiCLIP/files/human_GencodeV33.gtf.gz")
regions.gr <- import.gff2("~/Documents/Genomes/human/regions.gtf.gz")
# Clusters (Tosca)
hybrids.dt <- fread("/Users/iosubi/Documents/projects/computational_hiCLIP/files/all.clusters.tsv.gz")
paris_clusters.dt <- fread("/Users/iosubi/Documents/projects/computational_hiCLIP/files/paris/all.clusters.tsv.gz")|--------------------------------------------------|
|==================================================|
These analyses were performed on representative hybrids produced with the Tosca pipeline, as well as predicted duplexes using STAU1 peaks and RNAfold Only intramolecular 3’UTR duplexes were analysed. For the hybrid data, the hybrid with min. MFE was chosen to represent each cluster.
Sys.setenv(PATH="/Users/iosubi/opt/miniconda3/bin/")
nonhybrids.mfe.df <- get_mfe(nornase.forgi.dt, rnafold.df, prefix="stau1")
nonhybrids.mfe.df <- nonhybrids.mfe.df %>%
dplyr::select(id, eval_mfe) %>%
distinct()
nornase.forgi.dt <- left_join(nornase.forgi.dt, nonhybrids.mfe.df, by = "id")
nonhybrids.mfe.df <- nonhybrids.mfe.df %>%
mutate(Sample = "mfe", Experiment = "STAU1\nnonhybrids")
nonhybrids.mfe.df <- dplyr::rename(nonhybrids.mfe.df, MFE = eval_mfe)
nolinker.mfe.df <- nolinker.dt %>%
dplyr::select(id, mfe, shuffled_mfe) %>%
distinct()
nolinker.mfe.df <- nolinker.mfe.df %>%
gather("Sample", "MFE", mfe:shuffled_mfe) %>%
mutate(Experiment = "STAU1")paris.mfe.df <- paris.dt %>%
dplyr::select(id, mfe, shuffled_mfe) %>%
distinct()
paris.mfe.df <- paris.mfe.df %>%
gather("Sample", "MFE", mfe:shuffled_mfe) %>%
mutate(Experiment = "PARIS")stau1.hyb.nonhyb.dt <- bind_rows(nolinker.mfe.df, paris.mfe.df, nonhybrids.mfe.df)
stau1.hyb.nonhyb.dt <- stau1.hyb.nonhyb.dt %>%
mutate(sample = case_when((Sample == "mfe") ~ "STAU1",
(Sample == "shuffled_mfe") ~ "Shuffled control"))stau1.hyb.nonhyb.gg <- ggplot(stau1.hyb.nonhyb.dt, aes(x = MFE, linetype = Sample)) +
geom_density(alpha = 0.8) +
scale_colour_brewer(palette="Dark2") +
scale_linetype_manual(values=c( "solid", "longdash"))+
facet_grid(cols = vars(Experiment)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
ylab("Density") +
xlab("MFE (kcal/mol)") +
ggtitle("3'UTR clusters min MFE")
stau1.hyb.nonhyb.gg# ggsave("3UTR_clusters_nolinker_minMFE.pdf", stau1.hyb.nonhyb.gg, dpi=300, height = 7, width = 8) # all
stau1.hyb.nonhyb.dt <- stau1.hyb.nonhyb.dt %>%
dplyr::filter(Sample != "Shuffled control")
mfe.violin.gg <- ggplot(stau1.hyb.nonhyb.dt, aes(x = Experiment, y = MFE, fill = Experiment)) +
geom_violin() +
geom_boxplot(width=0.1) +
scale_fill_brewer(palette="Blues") +
#scale_color_grey() +
#stat_summary(fun.y=median, geom="point", size=2, color="grey") +
theme(legend.position="right", legend.direction="vertical",
legend.title = element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
ggtitle("3'UTR clusters min MFE")
mfe.violin.ggnolinker_features.df <- nolinker.dt %>%
dplyr::select(id, total_paired) %>%
ungroup()
max <- quantile(nolinker_features.df$total_paired, 0.90)[[1]]
min <- quantile(nolinker_features.df$total_paired, 0.2)[[1]]
median(nolinker_features.df$total_paired)[1] 23
filtered.nonhyb.forgi.df <- nornase.forgi.dt %>%
dplyr::filter(between(sum_paired, 8, max) )filtered.nonhyb.mfe.df <- filtered.nonhyb.forgi.df %>%
dplyr::select(id, eval_mfe) %>%
distinct() %>%
mutate(Sample = "mfe", Experiment = "STAU1 nonhybrids")
filtered.nonhyb.mfe.df <- dplyr::rename(filtered.nonhyb.mfe.df, MFE = eval_mfe)# Append datasets for all experiments
stau1.hyb.filtered_nonhyb.dt <- bind_rows(nolinker.mfe.df, paris.mfe.df, filtered.nonhyb.mfe.df)
stau1.hyb.filtered_nonhyb.dt <- stau1.hyb.filtered_nonhyb.dt %>%
mutate(sample = case_when((Sample == "mfe") ~ "STAU1",
(Sample == "shuffled_mfe") ~ "Shuffled control"))stau1.hyb.filtered_nonhyb.gg <- ggplot(stau1.hyb.filtered_nonhyb.dt, aes(x = MFE, linetype = Sample)) +
geom_density(alpha = 0.8) +
scale_colour_brewer(palette="Dark2") +
scale_linetype_manual(values=c( "solid", "longdash"))+
facet_grid(cols = vars(Experiment)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
ylab("Density") +
xlab("MFE (kcal/mol)") +
ggtitle("3'UTR clusters min MFE - filtered")
stau1.hyb.filtered_nonhyb.gg# all
stau1.hyb.filtered_nonhyb.dt <- stau1.hyb.filtered_nonhyb.dt %>%
dplyr::filter(Sample != "Shuffled control")
mfe.filtered.violin.gg <- ggplot(stau1.hyb.filtered_nonhyb.dt, aes(x = Experiment, y = MFE, fill = Experiment)) +
geom_violin() +
geom_boxplot(width=0.1) +
scale_fill_brewer(palette="Blues") +
#scale_color_grey() +
#stat_summary(fun.y=median, geom="point", size=2, color="grey") +
theme(legend.position="right", legend.direction="vertical",
legend.title = element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
ggtitle("3'UTR clusters min MFE")
mfe.filtered.violin.gg
ggsave("filtered_mfe_comparison.pdf", mfe.filtered.violin.gg, dpi = 300)Saving 7.29 x 4.51 in image
# No linker
nolinker.feat.df <- nolinker_forgi.dt %>% replace(is.na(.), 0) # replqce NA with zero to simplify filtering
nolinker.feat.df <- nolinker.feat.df %>%
group_by(id) %>%
mutate(stem_type = case_when(!any(element_type == "i") ~ "Uninterrupted stem",
any(element_type == "i") & (!any(element_type == "i" & (L_width > 1 | R_width > 1))) &
(!any(element_type == "i" & L_width != R_width)) ~ "Stem with only symmetrical bulges",
(any(element_type == "i")) & (!any(element_type == "i" & (L_width > 1 | R_width > 1))) &
(any(element_type == "i" & L_width != R_width)) ~ "Stem with non-symmetrical bulges",
(any(element_type == "i")) &
(any(element_type == "i" & (L_width > 1) & (R_width == 0 | R_width >= 1))) |
(any(element_type == "i" & (L_width = 1) & (R_width > 1))) ~ "Stem with internal loops"))
nolinker_pie.gg <- draw_pie_chart(nolinker.feat.df, stem_type)
ggsave(paste0("nolinker", "_stem_types_pie.pdf"), nolinker_pie.gg, height = 11, width = 7, dpi = 300)
nolinker_pie.gg + ggtitle("STAU1 hybrids")
# PARIS
paris.feat.df <- paris.forgi.dt %>% replace(is.na(.), 0) # replqce NA with zero to simplify filtering
paris.feat.df <- paris.feat.df %>%
group_by(id) %>%
mutate(stem_type = case_when(!any(element_type == "i") ~ "Uninterrupted stem",
any(element_type == "i") & (!any(element_type == "i" & (L_width > 1 | R_width > 1))) &
(!any(element_type == "i" & L_width != R_width)) ~ "Stem with only symmetrical bulges",
(any(element_type == "i")) & (!any(element_type == "i" & (L_width > 1 | R_width > 1))) &
(any(element_type == "i" & L_width != R_width)) ~ "Stem with non-symmetrical bulges",
(any(element_type == "i")) &
(any(element_type == "i" & (L_width > 1) & (R_width == 0 | R_width >= 1))) |
(any(element_type == "i" & (L_width = 1) & (R_width > 1))) ~ "Stem with internal loops"))
paris_pie.gg <- draw_pie_chart(paris.feat.df, stem_type)
ggsave(paste0("paris", "_stem_types_pie.pdf"), nolinker_pie.gg, height = 11, width = 7, dpi = 300)paris_pie.gg + ggtitle("PARIS hybrids")
# Nonhybrids
nornase.forgi.gg <- draw_pie_chart(nornase.forgi.dt, stem_type)
ggsave("non_hyb_stem_types_pie.pdf", nornase.forgi.gg, dpi = 300, height = 11, width = 7)nornase.forgi.gg + ggtitle("STAU1 nonhybrids")
filtered.nonhyb.forgi.gg <- draw_pie_chart(filtered.nonhyb.forgi.df, stem_type)
ggsave("filtered_non_hyb_stem_types_pie.pdf", filtered.nonhyb.forgi.gg, dpi = 300, height = 11, width = 7)filtered.nonhyb.forgi.gg + ggtitle(paste0("STAU1 nonhybrids ", 8, "-", max," bp long"))# paris.dt$total_paired
nolinker.feat.df <- analyse_duplexes(nolinker.feat.df)Joining, by = "id"
paris.feat.df <- analyse_duplexes(paris.feat.df)Joining, by = "id"
nolinker_features.df <- get_id_features(nolinker.feat.df)
nonhyb_features.df <- get_id_features(nornase.forgi.dt)
filtered_nonhyb_features.df <- get_id_features(filtered.nonhyb.forgi.df)
paris_features.df <- get_id_features(paris.feat.df)
filtered_nonhyb_features.df$Experiment <- "STAU1 nonhybrids (filtered)"
nolinker_features.df$Experiment <- "STAU1 hybrids"
nonhyb_features.df$Experiment <- "STAU1 nonhybrids"
paris_features.df$Experiment <- "PARIS"
all.features.df <- bind_rows(nolinker_features.df, filtered_nonhyb_features.df, nonhyb_features.df, paris_features.df)all.features.df <- rowid_to_column(all.features.df)
all.features.df <- all.features.df %>%
gather("feature", "counts", max_duplex:iloop_counts)all_features.gg <- ggplot(transform(all.features.df, feature = factor(feature, levels = c("sum_paired", "max_duplex","iloop_counts"))),
aes(x = "", y=counts, fill=Experiment)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_brewer(palette="Blues") +
facet_wrap(. ~ feature, scales = "free") +
theme(legend.position="right", legend.direction="vertical")
all_features.gg# ggsave("feature_comparison_boxplot.pdf", all_features.gg, dpi = 300)paris.dt <- paris.dt %>%
dplyr::select(-cluster_name)nolinker_nuc_freq.dfnolinker_nuc_freq.df <- get_nucleotide_frequency(nolinker.dt)New names:
* A -> A...41
* C -> C...42
* G -> G...43
* T -> T...44
* A -> A...45
* ...
nolinker_nuc_freq.gg <- plot_nucleotide_frequency(nolinker_nuc_freq.df)
paris_nuc_freq.df <- get_nucleotide_frequency(paris.dt)New names:
* A -> A...41
* C -> C...42
* G -> G...43
* T -> T...44
* A -> A...45
* ...
paris_nuc_freq.gg <- plot_nucleotide_frequency(paris_nuc_freq.df)
nonnhyb_nuc_freq.gg <- plot_nucleotide_frequency(filtered.nonhyb.forgi.df)
nolinker_nuc_freq.gg + ggtitle("STAU1")nonnhyb_nuc_freq.gg + ggtitle("STAU1 nonhybrids")paris_nuc_freq.gg + ggtitle("PARIS")These analyses were performed on the clustered hybrids produced with the Tosca pipeline.
hybrids.dt <- hybrids.dt[grep("^rRNA", L_seqnames, invert = TRUE)] # Remove rRNA
hybrids.dt <- hybrids.dt[grep("Mt", L_seqnames, invert = TRUE)]
# Reformat names of hybrids seqnames so they match the annotation gtf
hybrids.dt[, c("L_seqnames") := tstrsplit(L_seqnames, "::", fixed=TRUE)[1]]
hybrids.dt[, c("R_seqnames") := tstrsplit(L_seqnames, "::", fixed=TRUE)[1]]
# Convert to genomic coordinates
hybrids.dt <- convert_coordinates(hybrids.dt, genes.gr)
fwrite(hybrids.dt, "clusters.gc.tsv.gz", sep = "\t")
# Annotate hybrids
hybrids.dt <- annotate_hybrids(hybrids.dt, regions.gr)
fwrite(hybrids.dt, "annotated_clusters.gc.tsv.gz", sep = "\t")
hybrids.dt$Experiment <- "STAU1 - No linker"
paris_clusters.dt <- paris_clusters.dt[grep("^rRNA", L_seqnames, invert = TRUE)] # Remove rRNA
paris_clusters.dt <- paris_clusters.dt[grep("Mt", L_seqnames, invert = TRUE)]
paris_clusters.dt[, c("L_seqnames") := tstrsplit(L_seqnames, "::", fixed=TRUE)[1]]
paris_clusters.dt[, c("R_seqnames") := tstrsplit(L_seqnames, "::", fixed=TRUE)[1]]
# Convert to genomic coordinates
paris_clusters.dt <- convert_coordinates(paris_clusters.dt, genes.gr)
fwrite(paris_clusters.dt, "paris_clusters.gc.tsv.gz", sep = "\t")
# Annotate hybrids
paris_clusters.dt <- annotate_hybrids(paris_clusters.dt, regions.gr)
fwrite(paris_clusters.dt, "paris_annotated_clusters.gc.tsv.gz", sep = "\t")
paris_clusters.dt$Experiment <- "PARIS"# add col cluster_name because PARIS data does not have unique cluster identifiers
paris_clusters.dt <- paris_clusters.dt %>% unite("cluster_name", c(name, L_seqnames),
na.rm = TRUE, remove = FALSE, sep="_")
hybrids.dt <- hybrids.dt %>% unite("cluster_name", c(name, L_seqnames),
na.rm = TRUE, remove = FALSE, sep="_")all_exp.df <- as.data.frame(rbind(hybrids.dt, paris_clusters.dt))
# intramolecular only
all_exp.df <- all_exp.df %>% dplyr::filter(L_gene_name == R_gene_name)biotype_counts.df <- all_exp.df %>%
dplyr::select(cluster_name, Experiment, L_biotype, R_biotype) %>%
pivot_longer(c(L_biotype, R_biotype), names_to = "arm", values_to = "biotype")
biotype_counts.df <- biotype_counts.df %>%
group_by(Experiment, biotype) %>%
mutate(biotype = case_when(str_detect(biotype, "lncRNA") ~ "lncRNA",
TRUE ~ biotype)) %>%
summarise(counts = n()) %>%
arrange(desc(biotype)) %>%
mutate(percentage = counts*100 / sum(counts)) %>%
ungroup()
biotype_counts.df <- biotype_counts.df %>%
mutate(RNA_biotype = case_when((percentage < 0.05) ~ "Other",
(percentage >= 0.05) ~ biotype))
biotype_counts.df <- biotype_counts.df %>%
group_by(Experiment, RNA_biotype) %>%
mutate(biotype_percentage = sum(percentage)) %>%
ungroup()# Simple bar-chart
ggplot(distinct(biotype_counts.df, across(c(Experiment, RNA_biotype, biotype_percentage))),
aes(RNA_biotype, biotype_percentage)) +
geom_bar(position="dodge", stat = "identity", aes(fill = Experiment), alpha = 0.8) +
scale_fill_brewer(palette="Dark2") +
#scale_fill_paletteer_d("rcartocolor::Earth", direction = -1) +
theme(legend.position="right", legend.direction="vertical",
legend.title = element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Percentage") +
xlab("RNA biotype")all_exp_mRNA.df <- all_exp.df %>% dplyr::filter(R_biotype == "mRNA" & L_biotype == "mRNA")
all_exp_mRNA.df <- all_exp_mRNA.df %>%
mutate(hybrid_type = case_when((L_region == "UTR3" & R_region == "UTR3") ~ "3'UTR-3'UTR",
(L_region == "CDS" & R_region == "CDS") ~ "CDS-CDS",
(L_region == "UTR3" & R_region == "CDS") ~ "3'UTR-CDS",
(L_region == "CDS" & R_region == "UTR3") ~ "3'UTR-CDS",
(L_region == "UTR5" & R_region == "UTR5") ~ "5'UTR-5'UTR",
(L_region == "UTR3" & R_region == "UTR5") ~ "3'UTR-5'UTR",
(L_region == "UTR5" & R_region == "UTR3") ~ "3'UTR-5'UTR",
(L_region == "UTR5" & R_region == "CDS") ~ "5'UTR-CDS",
(L_region == "CDS" & R_region == "UTR5") ~ "5'UTR-CDS"))
plot_stacked_barchart(all_exp_mRNA.df, hybrid_type, Experiment)# Get hybrid spans; Get hybrid arm widths
threeutr.df <- all_exp_mRNA.df %>%
dplyr::filter(L_region == "UTR3" & R_region == "UTR3")
threeutr.df <- threeutr.df %>%
rowwise() %>%
mutate(hybrid_span = R_start - L_end + 1,
L_arm_width = L_end - L_start + 1,
R_arm_width = R_end - R_start + 1)ggplot(threeutr.df, aes(x = hybrid_span, color = Experiment)) +
geom_density(alpha = 0.8) +
scale_colour_brewer(palette="Dark2") +
scale_x_log10() + annotation_logticks() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
ylab("Density") +
xlab("Hybrid span")# convert to long format
threeutr_long.df <- threeutr.df %>%
pivot_longer(c(R_arm_width, L_arm_width), names_to = "arm", values_to = "arm_width")
ggplot(threeutr_long.df, aes(x = arm, y = arm_width, fill = Experiment)) +
geom_boxplot(alpha = 0.7) +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_brewer(palette="Dark2") +
theme(legend.position="right", legend.direction="vertical") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
ylab("Hybrid arm width") +
xlab("Arm")